# Compute macro-averaged classification metrics from a confusion matrix.
#
# Args:
#   class.pred: predicted class labels. Currently unused; kept so existing
#               call sites (which pass it) keep working.
#   class.obs:  observed class labels. Currently unused, as above.
#   conf.mx:    square confusion matrix with predictions in rows and
#               observed classes in columns, as produced by table(pred, obs).
#
# Returns: a one-row data.frame with columns avg.accuracy and F1.score.
measure.classification <- function(class.pred, class.obs, conf.mx) {
  n.obs <- sum(conf.mx)
  # Per-class accuracy is (TP_i + TN_i) / n. Using the identity
  #   TP_i + TN_i = n - FP_i - FN_i = n - rowSums_i - colSums_i + 2 * diag_i
  # this vectorizes the old for-loop, which divided by the loop index `i`
  # after the loop ended — a fragile idiom that relied on the loop
  # variable leaking into the enclosing scope.
  per.class.acc <- (n.obs - rowSums(conf.mx) - colSums(conf.mx) +
                      2 * diag(conf.mx)) / n.obs
  avg.accuracy <- mean(per.class.acc)
  # Macro-averaged precision (rows = predictions) and recall (columns =
  # observed classes), then their harmonic mean as the F1 score.
  precision <- mean(diag(conf.mx) / rowSums(conf.mx))
  recall <- mean(diag(conf.mx) / colSums(conf.mx))
  F1.score <- 2 * precision * recall / (precision + recall)
  data.frame(avg.accuracy = avg.accuracy, F1.score = F1.score)
}
Cardiovascular diseases (CVDs) are the number one cause of death globally. About 17.9 million people die from CVDs each year, which is about 31% of all deaths worldwide. Heart failure is a common event caused by CVDs, and the dataset train.csv contains 10 features that can be used to predict mortality by heart failure.
# Project working directory. NOTE(review): an absolute setwd() makes the
# script machine-specific — consider relative paths or the here package.
setwd("/Users/tarineccleston/Documents/Masters/STATS 762/regression-for-DS/disease-and-weather/")
# Tree models: rpart for decision trees, rpart.plot for plotting them,
# randomForest for bagged ensembles. The '##' lines below are knitted
# console output preserved from the original document.
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::margin() masks randomForest::margin()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
# MASS provides lda()/qda(); note it masks dplyr::select below.
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
# klaR: partimat() partition plots; e1071: svm(); lubridate: date parsing
# for the weather data; splines: spline bases for the temperature model.
library(klaR)
library(e1071)
library(lubridate)
library(splines)
Which explanatory variables are nominal and which variables are numeric? Identify them and modify the data to read explanatory variables correctly.
# Load the heart-failure training data. read_csv parses all 11 columns as
# doubles, including the binary indicators — fixed to factors further down.
cvd_df = read_csv("data/train.csv")
## Rows: 299 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): age, anaemia, creatinine_phosphokinase, diabetes, ejection_fractio...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect column types and a preview of the values
glimpse(cvd_df)
## Rows: 299
## Columns: 11
## $ age <dbl> 75, 55, 65, 50, 65, 90, 75, 60, 65, 80, 75, 6…
## $ anaemia <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, …
## $ creatinine_phosphokinase <dbl> 582, 7861, 146, 111, 160, 47, 246, 315, 157, …
## $ diabetes <dbl> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ ejection_fraction <dbl> 20, 38, 20, 20, 20, 40, 15, 60, 65, 35, 38, 2…
## $ high_blood_pressure <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ platelets <dbl> 265000, 263358, 162000, 210000, 327000, 20400…
## $ serum_sodium <dbl> 130, 136, 129, 137, 116, 132, 137, 131, 138, …
## $ sex <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, …
## $ smoking <dbl> 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, …
## $ death <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
Nominal: anaemia, diabetes, high_blood_pressure, sex, smoking, death Numeric: age, creatinine_phosphokinase, ejection_fraction, platelets, serum_sodium
# Force the nominal 0/1 indicator columns to factors so the classifiers
# treat them as categorical rather than numeric.
nominal_cols <- c("death", "anaemia", "diabetes",
                  "high_blood_pressure", "sex", "smoking")
cvd_df[nominal_cols] <- lapply(cvd_df[nominal_cols], as.factor)
Write the reason for your model choice.
NOTE: d) find best classifier NOTE: Compare the best between b and d NOTE: Make stronger comparison between d and b NOTE: Maybe use age and ejection fraction one the best model in d and then compare NOTE: Find best model in b then find the best model in d and compare NOTE: Fix scaling
# use 80/20 train/test split for an unbiased estimate
# NOTE(review): no set.seed() before sample(), so the split — and every
# downstream metric — differs on each run; the printed outputs below may
# not reproduce exactly.
index = sample(nrow(cvd_df), as.integer(nrow(cvd_df)*0.8), replace = FALSE)
cvd_train_df = cvd_df[index, ]
cvd_test_df = cvd_df[-index, ]
# fit model
# Logistic regression on all explanatory variables; predicted
# probabilities above 0.5 are labelled death (1), otherwise survival (0).
cvd_log_fit = glm(death ~ ., data = cvd_train_df, family = 'binomial')
death_probs = data.frame(probs = predict(cvd_log_fit, cvd_test_df, type="response"))
# ifelse() over the one-column data.frame yields a 0/1 matrix of labels
death_pred_logistic = ifelse(death_probs > 0.5, 1, 0)
conf_mx_logistic = table(death_pred_logistic, cvd_test_df$death)
conf_mx_logistic
##
## death_pred_logistic 0 1
## 0 35 17
## 1 4 4
# Accumulate each model's evaluation metrics into one comparison table
metric_comparison = data.frame()
metrics_logistic = measure.classification(death_pred_logistic, cvd_test_df$death, conf_mx_logistic)
rownames(metrics_logistic) = "Logistic Regression"
metric_comparison = rbind(metric_comparison, metrics_logistic)
Fit logistic regression first as this is the simplest classification model and can be used as a benchmark. We are predicting the likelihood of death (0 or 1) given all other explanatory variables. Probabilities above 0.5 result in death, otherwise anything below 0.5 will result in living.
# Grow a deliberately deep classification tree (cp = 0.001) so the CP
# table exposes a range of candidate prunings; xerror is the
# cross-validated relative error used to pick the pruning level.
cvd_dt_fit <- rpart(death ~ ., data = cvd_train_df, method = "class", cp=0.001)
cvd_dt_fit$cptable
## CP nsplit rel error xerror xstd
## 1 0.200000000 0 1.0000000 1.0000000 0.09565162
## 2 0.120000000 1 0.8000000 0.8266667 0.09034877
## 3 0.026666667 2 0.6800000 0.6933333 0.08504764
## 4 0.008888889 5 0.6000000 0.7866667 0.08887959
## 5 0.001000000 8 0.5733333 0.8533333 0.09127436
Tree No. 4 provides the best accuracy with a dip in xerror. Find the parsimonious model by adding the standard error, resulting in tree no. 3 with 2 splits.
# One-standard-error rule: take the largest CP whose cross-validated
# error (column 4, xerror) is below min(xerror) + its standard error
# (column 5, xstd) — i.e. the most parsimonious tree within one SE of
# the best-scoring one.
cp.1se= max(cvd_dt_fit$cptable[cvd_dt_fit$cptable[,4]<sum(cvd_dt_fit$cptable[which.min(cvd_dt_fit$cptable[,4]),c(4,5)]),1])
cp.1se
## [1] 0.02666667
# Prune the deep tree back to the 1-SE complexity parameter
cvd_dt_prune_fit <-prune(cvd_dt_fit, cp = cp.1se)
cvd_dt_prune_fit
## n= 239
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 239 75 0 (0.6861925 0.3138075)
## 2) ejection_fraction>=22.5 220 58 0 (0.7363636 0.2636364)
## 4) age< 79 199 43 0 (0.7839196 0.2160804) *
## 5) age>=79 21 6 1 (0.2857143 0.7142857) *
## 3) ejection_fraction< 22.5 19 2 1 (0.1052632 0.8947368) *
# Evaluate the pruned tree on the held-out test set
death_pred_dt_pruned = predict(cvd_dt_prune_fit, cvd_test_df, type="class")
conf_mx_dt_pruned = table(death_pred_dt_pruned, cvd_test_df$death)
conf_mx_dt_pruned
##
## death_pred_dt_pruned 0 1
## 0 36 16
## 1 3 5
metrics_dt = measure.classification(death_pred_dt_pruned, cvd_test_df$death, conf_mx_dt_pruned)
rownames(metrics_dt) = "Decision Tree"
metric_comparison = rbind(metric_comparison, metrics_dt)
The decision tree performs worse. This could be due to the discretisation of the decision space, which could result in a loss of information from other variables.
# Random forest on all explanatory variables; importance=TRUE stores
# variable-importance measures on the fit for later inspection.
cvd_rf_fit = randomForest(death ~ ., data = cvd_train_df, importance=TRUE)
plot(cvd_rf_fit)
# BUG FIX: this previously called predict(cvd_dt_fit, ..., type="class"),
# i.e. the unpruned DECISION TREE, so the "Random Forest" row in the
# comparison table was actually decision-tree results. Predict with the
# forest instead; type="response" returns class labels for a
# classification forest ("class" is not a valid predict.randomForest
# type). The printed confusion matrix below predates this fix and will
# change on re-run.
death_rf_pred = predict(cvd_rf_fit, cvd_test_df, type="response")
conf_mx_rf = table(death_rf_pred, cvd_test_df$death)
conf_mx_rf
##
## death_rf_pred 0 1
## 0 30 11
## 1 9 10
metrics_rf = measure.classification(death_rf_pred, cvd_test_df$death, conf_mx_rf)
rownames(metrics_rf) = "Random Forest"
metric_comparison = rbind(metric_comparison, metrics_rf)
Which variables are useful to predict death due to heart failure? List the two most important variables.
# Coefficient table for the logistic fit — used to judge which variables
# are most useful via their p-values (age and ejection_fraction stand out).
summary(cvd_log_fit)
##
## Call:
## glm(formula = death ~ ., family = "binomial", data = cvd_train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7018 -0.8040 -0.4427 0.8417 2.6494
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.432e+00 4.998e+00 1.687 0.0916 .
## age 6.320e-02 1.454e-02 4.346 1.38e-05 ***
## anaemia1 8.662e-02 3.319e-01 0.261 0.7941
## creatinine_phosphokinase 2.876e-04 1.539e-04 1.868 0.0618 .
## diabetes1 3.556e-01 3.324e-01 1.070 0.2847
## ejection_fraction -8.174e-02 1.841e-02 -4.439 9.04e-06 ***
## high_blood_pressure1 4.470e-01 3.368e-01 1.327 0.1845
## platelets -1.264e-06 1.785e-06 -0.708 0.4790
## serum_sodium -7.550e-02 3.671e-02 -2.056 0.0397 *
## sex1 -7.290e-02 3.882e-01 -0.188 0.8510
## smoking1 -3.178e-01 3.865e-01 -0.822 0.4110
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 297.37 on 238 degrees of freedom
## Residual deviance: 238.53 on 228 degrees of freedom
## AIC: 260.53
##
## Number of Fisher Scoring iterations: 5
Age and ejection fraction have the smallest p-values in the summary table, so they appear to be the two most useful variables for predicting death due to heart failure. However, we cannot rely on this statistic alone, so we also use variable importance from the decision tree.
# Variable importance from the (unpruned) decision tree, ranked ascending
# and drawn as a horizontal bar chart (coord_flip puts variables on the
# y-axis with the most important at the top).
dt_important = data.frame(importance = cvd_dt_fit$variable.importance)
ranking = dt_important %>%
  tibble::rownames_to_column() %>%
  dplyr::rename("variable" = rowname) %>%
  dplyr::arrange(importance) %>%
  # fct_inorder freezes the sorted order so ggplot doesn't re-sort alphabetically
  dplyr::mutate(variable = forcats::fct_inorder(variable))
ggplot2::ggplot(ranking) +
  # show.legend spelled as FALSE rather than the reassignable shorthand F
  geom_col(aes(x = variable, y = importance),
           col = "black", show.legend = FALSE) +
  coord_flip() +
  # NOTE(review): scale_fill_grey() has no visible effect here because no
  # fill aesthetic is mapped; kept to preserve the original plot spec.
  scale_fill_grey() +
  theme_bw()
A decision tree works by selecting the variable and decision boundary that provide the greatest information gain in separating the death (1) and non-death (0) classes. Variables of higher importance are used for splits higher up the decision tree. Looking at the variable importance graph, we can confirm that ejection fraction and age are the two most important variables for predicting death caused by CVD.
LDA, QDA and SVM are only applicable to numeric explanatory variables. If we consider only numeric explanatory variables, do we get a better classifier? Compare classifiers using the same validation method as in (b) and answer the question.
Use now only age, creatinine_phosphokinase, ejection_fraction, platelets, serum_sodium in our model. We can now use LDA, QDA and SVM on-top of decision trees, random forests and boosting methods. Let’s compare their relative performances using the aforementioned evaluation metrics. Let’s do an EDA first…
# standardise covariates for the whole dataset for the EDA
# NOTE(review): this overwrites cvd_df AFTER cvd_train_df/cvd_test_df were
# created, so the train/test frames used by the models remain unscaled —
# confirm this is intended (cf. the "Fix scaling" note above).
exclude_cols <- c("death", "anaemia", "diabetes", "high_blood_pressure", "sex", "smoking")
cvd_df[, -which(names(cvd_df) %in% exclude_cols)] <- scale(cvd_df[, -which(names(cvd_df) %in% exclude_cols)])
legend_colors = c("red", "blue")
# Pairwise scatter plots of the numeric covariates, coloured by outcome
{
pairs(cvd_df[c("age", "creatinine_phosphokinase", "ejection_fraction", "platelets", "serum_sodium")],
col = ifelse(cvd_df$death == 1, "red", "blue"),
main = "Pairs Plot of Explanatory Variables and Death")
legend("topright", legend = c("Death", "Survived"), col = legend_colors, pch = 1)
}
# Bar chart of the outcome class frequencies to show the class imbalance
class_freq = table(cvd_df$death)
class_labels = recode(names(class_freq), "1" = "death", "0" = "survived")
ggplot(data.frame(Class = class_labels, Frequency = as.vector(class_freq)), aes(x = Class, y = Frequency, fill = Class)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("red", "blue")) +
xlab("Class") +
ylab("Frequency") +
ggtitle("Class Imbalance in the Dataset")
There appears to be a somewhat clear separation vertical linear boundary between death and survival at the lower end of ejection fraction compared to all other explanatory variables.
Observations with a greater age appear to have a higher chance of dying due to CVD.
Platelets and serum sodium versus the other explanatory variables appear to have slight quadratic decision boundaries between death and survival.
No notable blobs in the pair plot.
No distinguishable distribution or clusters between death and surviving classes for all pairs of explanatory variables.
There appears to be a class imbalance between survived and dead observations, with the survived class having roughly twice as many observations.
# LDA on the five numeric covariates (linear decision boundaries,
# shared covariance across classes)
cvd_lda_fit = lda(death ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, cvd_train_df)
cvd_lda_fit$means
## age creatinine_phosphokinase ejection_fraction platelets serum_sodium
## 0 59.01626 545.1159 40.14024 268394.8 137.1707
## 1 65.50223 745.8267 32.73333 254261.1 134.9733
# plot boundaries
partimat(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, method = "lda")
# prediction on the held-out test set; as.data.frame exposes $class
death_lda_pred = predict(cvd_lda_fit, cvd_test_df)
death_lda_pred = as.data.frame(death_lda_pred)
# confusion matrix: predicted class vs observed death
lda_mx <- table(death_lda_pred$class, cvd_test_df$death)
lda_mx
##
## 0 1
## 0 37 16
## 1 2 5
# metrics_lda = measure.classification(death_lda_pred$class, death_lda_pred$posterior.1, cvd_test_df$death, lda_mx)
metrics_lda = measure.classification(death_lda_pred$class, cvd_test_df$death, lda_mx)
rownames(metrics_lda) = "LDA"
metric_comparison = rbind(metric_comparison, metrics_lda)
# QDA on the same covariates (quadratic boundaries, per-class covariance)
cvd_qda_fit = qda(death ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, cvd_train_df)
cvd_qda_fit$means
## age creatinine_phosphokinase ejection_fraction platelets serum_sodium
## 0 59.01626 545.1159 40.14024 268394.8 137.1707
## 1 65.50223 745.8267 32.73333 254261.1 134.9733
# plot boundaries
partimat(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, method = "qda")
# prediction on the held-out test set
death_qda_pred = predict(cvd_qda_fit, cvd_test_df)
death_qda_pred = as.data.frame(death_qda_pred)
# confusion matrix: predicted class vs observed death
qda_mx <- table(death_qda_pred$class, cvd_test_df$death)
qda_mx
##
## 0 1
## 0 38 19
## 1 1 2
# metrics_qda = measure.classification(death_qda_pred$class, death_qda_pred$posterior.1, cvd_test_df$death, qda_mx)
metrics_qda = measure.classification(death_qda_pred$class, cvd_test_df$death, qda_mx)
rownames(metrics_qda) = "QDA"
metric_comparison = rbind(metric_comparison, metrics_qda)
# SVM with a linear kernel on the numeric covariates; probability = TRUE
# trains the extra model needed for probability estimates.
cvd_svm_linear_fit <- svm(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, kernel = 'linear', probability = TRUE)
# most interesting plots
# Pairwise decision-boundary views of the fitted SVM over the training data
{
plot(cvd_svm_linear_fit, cvd_train_df, age ~ ejection_fraction)
plot(cvd_svm_linear_fit, cvd_train_df, age ~ creatinine_phosphokinase)
plot(cvd_svm_linear_fit, cvd_train_df, age ~ platelets)
plot(cvd_svm_linear_fit, cvd_train_df, age ~ serum_sodium)
plot(cvd_svm_linear_fit, cvd_train_df, creatinine_phosphokinase ~ ejection_fraction)
plot(cvd_svm_linear_fit, cvd_train_df, ejection_fraction ~ serum_sodium)
}
Age vs Ejection Fraction shows the strongest decision boundary to distinguish the positive and negative classes in absence of a distribution.
# Evaluate the linear-kernel SVM on the held-out test set
death_svm_linear_pred = predict(cvd_svm_linear_fit, cvd_test_df)
svm_linear_mx = table(death_svm_linear_pred, cvd_test_df$death)
svm_linear_mx
##
## death_svm_linear_pred 0 1
## 0 36 14
## 1 3 7
metrics_svm_linear = measure.classification(death_svm_linear_pred, cvd_test_df$death, svm_linear_mx)
rownames(metrics_svm_linear) = "SVM (Linear Kernel)"
metric_comparison = rbind(metric_comparison, metrics_svm_linear)
# SVM with a polynomial kernel (default degree 3), same covariates as above
cvd_svm_poly_fit = svm(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, kernel = 'polynomial', probability = TRUE)
# Pairwise decision-boundary views of the polynomial-kernel fit
{
plot(cvd_svm_poly_fit, cvd_train_df, age ~ ejection_fraction)
plot(cvd_svm_poly_fit, cvd_train_df, age ~ creatinine_phosphokinase)
plot(cvd_svm_poly_fit, cvd_train_df, age ~ platelets)
plot(cvd_svm_poly_fit, cvd_train_df, age ~ serum_sodium)
plot(cvd_svm_poly_fit, cvd_train_df, creatinine_phosphokinase ~ ejection_fraction)
plot(cvd_svm_poly_fit, cvd_train_df, ejection_fraction ~ serum_sodium)
}
# Evaluate on the held-out test set
death_svm_poly_pred = predict(cvd_svm_poly_fit, cvd_test_df)
svm_poly_mx = table(death_svm_poly_pred, cvd_test_df$death)
svm_poly_mx
##
## death_svm_poly_pred 0 1
## 0 38 19
## 1 1 2
metrics_svm_poly = measure.classification(death_svm_poly_pred, cvd_test_df$death, svm_poly_mx)
rownames(metrics_svm_poly) = "SVM (Polynomial Kernel)"
metric_comparison = rbind(metric_comparison, metrics_svm_poly)
# SVM with a radial (RBF) kernel, same covariates as above
cvd_svm_rad_fit = svm(as.factor(death) ~ age + creatinine_phosphokinase + ejection_fraction + platelets + serum_sodium, data = cvd_train_df, kernel = 'radial', probability = TRUE)
# Pairwise decision-boundary views of the radial-kernel fit
{
plot(cvd_svm_rad_fit, cvd_train_df, age ~ ejection_fraction)
plot(cvd_svm_rad_fit, cvd_train_df, age ~ creatinine_phosphokinase)
plot(cvd_svm_rad_fit, cvd_train_df, age ~ platelets)
plot(cvd_svm_rad_fit, cvd_train_df, age ~ serum_sodium)
plot(cvd_svm_rad_fit, cvd_train_df, creatinine_phosphokinase ~ ejection_fraction)
plot(cvd_svm_rad_fit, cvd_train_df, ejection_fraction ~ serum_sodium)
}
# Evaluate on the held-out test set
death_svm_rad_pred = predict(cvd_svm_rad_fit, cvd_test_df)
svm_rad_mx = table(death_svm_rad_pred, cvd_test_df$death)
svm_rad_mx
##
## death_svm_rad_pred 0 1
## 0 37 18
## 1 2 3
metrics_svm_rad = measure.classification(death_svm_rad_pred, cvd_test_df$death, svm_rad_mx)
rownames(metrics_svm_rad) = "SVM (Radial Kernel)"
metric_comparison = rbind(metric_comparison, metrics_svm_rad)
!!! WRITE DISCUSSION HERE !!!
Auckland from 1 Jan 2014 to 30 June 2020. The attributes follow;
year; 2014, …, 2020 month; 1,2,…,12 day; 1,2,… maxT; (Maximum daily temperature in Celsius) 3, 6.7, 4.8 … 6.7
and are presented graphically. We use this as prestudy to model the long term behaviour of daily max temperature.
are observed. We will treat month as a factor variable and assume fixed month effect.
we will introduce the new variable y which it is a vector of cumulative days from 1 Jan 2014. For example, y=1 on 1 Jan 2014, …, y=31 on 31 Jan 2014, …, y=2283 on 30 June 2020.
and maxT
# Load the Auckland daily maximum-temperature data (1 Jan 2014 – 30 Jun 2020)
akl_weather_df = read_csv("data/AckTemp.csv")
## Rows: 2283 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): year, month, day, maxT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(akl_weather_df)
## Rows: 2,283
## Columns: 4
## $ year <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014…
## $ month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 2…
## $ maxT <dbl> 21.9, 23.8, 25.6, 24.2, 23.7, 26.0, 22.2, 24.1, 21.9, 22.0, 22.3…
# Build a proper date from year/month/day, then y = cumulative day count
# since 1 Jan 2014 (y = 1 on 1 Jan 2014, ..., y = 2283 on 30 Jun 2020);
# the date difference is offset by +1 so the first day is 1, not 0.
akl_weather_df$datetime <- ymd(paste(akl_weather_df$year, akl_weather_df$month, akl_weather_df$day, sep = "-"))
reference_datetime <- ymd("2014-01-01")
akl_weather_df = akl_weather_df %>%
mutate(y = as.numeric(difftime(akl_weather_df$datetime, reference_datetime, units = "days")) + 1)
# Keep only the modelling variables and treat month as a fixed factor effect
akl_weather_df = akl_weather_df[,c("y", "month", "maxT")]
akl_weather_df$month = as.factor(akl_weather_df$month)
glimpse(akl_weather_df)
## Rows: 2,283
## Columns: 3
## $ y <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 2…
## $ month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ maxT <dbl> 21.9, 23.8, 25.6, 24.2, 23.7, 26.0, 22.2, 24.1, 21.9, 22.0, 22.3…
perature using the data in (a). Write reason(s) for your choice of model and degrees of freedom.
# Scatter plot of daily maximum temperature against cumulative day to
# eyeball trend and the annual seasonal cycle before modelling
plot(maxT ~ y, data = akl_weather_df, xlab = "Days Since 1-1-2014 (Days)", ylab = "Maximum Temperature (C)", main = "Maximum Temperatures in Auckland (1-1-2014 - 30-6-2020)")
There is seasonality and trend on inspection. The seasonality is based on seasons in New Zealand, so we would expect a cycle period of 365 days (1 year).
NOTE: Use CV NOTE: create MSE plot Maybe choose parsim model?